In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

Data Exploration and Cleaning¶

In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
df.head()
Out[3]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

In [4]:
df.shape
Out[4]:
(1460, 81)
In [5]:
# Check null value
df_na = df.isna().sum()
df_na[df_na > 0]
Out[5]:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

Transform Hierarchy Categorical Data¶

In [6]:
# ExterQual
mapping = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
df['ExterQual'] = df['ExterQual'].map(mapping)
df['ExterCond'] = df['ExterCond'].map(mapping)
df['HeatingQC'] = df['HeatingQC'].map(mapping)
df['KitchenQual'] = df['KitchenQual'].map(mapping)
df['BsmtQual'] = df['BsmtQual'].map(mapping)
df['BsmtCond'] = df['BsmtCond'].map(mapping)
df['FireplaceQu'] = df['FireplaceQu'].map(mapping)
df['PoolQC'] = df['PoolQC'].map(mapping)


mapping_2 = {'Gd': 3,'Av': 2, 'Mn': 1}
df['BsmtExposure'] = df['BsmtExposure'].map(mapping_2)

mapping_3 = {'GLQ': 6,'ALQ': 5, 'BLQ': 4,'Rec': 3,'LwQ': 2, 'Unf': 1}
df['BsmtFinType1'] = df['BsmtFinType1'].map(mapping_3)
df['BsmtFinType2'] = df['BsmtFinType2'].map(mapping_3)

mapping_4 = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
df['GarageQual'] = df['GarageQual'].map(mapping_4)
df['GarageCond'] = df['GarageCond'].map(mapping_4)

mapping_5 = {'N': 0, 'Y': 1}
df['CentralAir'] = df['CentralAir'].map(mapping_5)

mapping_6 = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
df['LotShape'] = df['LotShape'].map(mapping_6)

mapping_7 = {'Gtl': 3, 'Mod': 2, 'Sev': 1}
df['LandSlope'] = df['LandSlope'].map(mapping_7)
In [7]:
df.shape
Out[7]:
(1460, 81)

Deal With Missing Value¶

LotFrontage¶
In [8]:
sns.boxplot(x=df['LotFrontage'])
plt.title('LotFrontage (with outliers)')
plt.show()
In [9]:
df['LotFrontage'].fillna(df['LotFrontage'].mean(), inplace =True)
Alley¶
In [10]:
df.Alley.describe
Out[10]:
<bound method NDFrame.describe of 0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1455    NaN
1456    NaN
1457    NaN
1458    NaN
1459    NaN
Name: Alley, Length: 1460, dtype: object>
In [11]:
df['Alley'].fillna('NA', inplace =True)
MasVnrType¶
In [12]:
df.MasVnrType
Out[12]:
0       BrkFace
1          None
2       BrkFace
3          None
4       BrkFace
         ...   
1455       None
1456      Stone
1457       None
1458       None
1459       None
Name: MasVnrType, Length: 1460, dtype: object
In [13]:
df['MasVnrType'].fillna('None', inplace =True)
MasVnrArea¶
In [14]:
df['MasVnrArea'].fillna(0, inplace =True)
BsmtQual¶
In [15]:
df['BsmtQual'].fillna(df['BsmtQual'].mean(), inplace =True)
BsmtCond¶
In [16]:
df['BsmtCond'].fillna(df['BsmtCond'].mean(), inplace =True)
BsmtExposure¶
In [17]:
df['BsmtExposure'].fillna(df['BsmtExposure'].mean(), inplace =True)
BsmtFinType1¶
In [18]:
df['BsmtFinType1'].fillna(df['BsmtFinType1'].mean(), inplace =True)
BsmtFinType2¶
In [19]:
df['BsmtFinType2'].fillna(df['BsmtFinType2'].mean(), inplace =True)
Electrical¶
In [20]:
sns.displot(df['Electrical'])
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x127f27890>
In [21]:
df['Electrical'].fillna('None', inplace =True)
FireplaceQu¶
In [22]:
df.FireplaceQu.head()
Out[22]:
0    NaN
1    3.0
2    3.0
3    4.0
4    3.0
Name: FireplaceQu, dtype: float64
In [23]:
df['FireplaceQu'].fillna(df['FireplaceQu'].mean(), inplace =True)
Garage¶
In [24]:
df['GarageType'].fillna('None', inplace =True)
In [25]:
df['GarageYrBlt'].fillna(df['GarageYrBlt'].mean(), inplace =True)
In [26]:
df['GarageFinish'].fillna('None', inplace =True)
In [27]:
df['GarageQual'].fillna('None', inplace =True)
In [28]:
df['GarageCond'].fillna('None', inplace =True)

PoolQC¶

In [29]:
df['PoolQC'].fillna(df['PoolQC'].mean(), inplace =True)
Fence¶
In [30]:
df['Fence'].fillna('None', inplace =True)
MiscFeature¶
In [31]:
df['MiscFeature'].fillna('None', inplace =True)
In [32]:
df_na = df.isna().sum()
df_na[df_na > 0]
Out[32]:
Series([], dtype: int64)
In [33]:
df_clean = df

Transform to yearold¶

In [34]:
df_clean['GarageYrBlt'].dtype
df_clean['GarageYrBlt'] = 2023 - df_clean['GarageYrBlt']
df_clean['YearBuilt'] = 2023 - df_clean['YearBuilt']
df_clean['YearRemodAdd'] = 2023 - df_clean['YearRemodAdd']
df_clean['YrSold'] = 2023 - df_clean['YrSold']
In [35]:
df_clean['YearBuilt'].head()
Out[35]:
0     20
1     47
2     22
3    108
4     23
Name: YearBuilt, dtype: int64
In [36]:
df_clean.rename(columns={'GarageYrBlt': 'GarageAge'}, inplace=True)
df_clean.rename(columns={'YearBuilt': 'YearBuiltAge'}, inplace=True)
df_clean.rename(columns={'YearRemodAdd': 'YearRemodAddAge'}, inplace=True)
df_clean.rename(columns={'YrSold': 'YrSoldAge'}, inplace=True)
In [37]:
#df_clean['Log_GarageAge'] = np.log(df_clean['GarageAge'])
In [38]:
#df_clean['Log_YearBuiltAge'] = np.log(df_clean['YearBuiltAge'])
In [39]:
#df_clean['Log_YearRemodAddAge'] = np.log(df_clean['YearRemodAddAge'])
In [40]:
#df_clean['Log_YrSoldAge'] = np.log(df_clean['YrSoldAge'])
In [41]:
#df_clean.drop(columns=['GarageAge', 'YearBuiltAge', 'YearRemodAddAge', 'YrSoldAge'])
In [42]:
df_clean['SalePrice'] = np.log(df_clean['SalePrice'])

Conduct PCA to Reduce Dimention for Numerical Variables¶

In [43]:
df_clean.shape
Out[43]:
(1460, 81)
In [44]:
numerical_df_clean = df_clean.select_dtypes(include='number')
In [45]:
numerical_df_clean.shape
Out[45]:
(1460, 52)
In [46]:
numerical_df_clean = numerical_df_clean.drop(columns=['Id', 'SalePrice'], axis=1)
In [47]:
numerical_df_clean.shape
Out[47]:
(1460, 50)
In [48]:
numerical_df_clean.head()
Out[48]:
MSSubClass LotFrontage LotArea LotShape LandSlope OverallQual OverallCond YearBuiltAge YearRemodAddAge MasVnrArea ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC MiscVal MoSold YrSoldAge
0 60 65.0 8450 4 3 7 5 20 20 196.0 ... 0 61 0 0 0 0 3.714286 0 2 15
1 20 80.0 9600 4 3 6 8 47 47 0.0 ... 298 0 0 0 0 0 3.714286 0 5 16
2 60 68.0 11250 3 3 7 5 22 21 162.0 ... 0 42 0 0 0 0 3.714286 0 9 15
3 70 60.0 9550 3 3 7 5 108 53 0.0 ... 0 35 272 0 0 0 3.714286 0 2 17
4 60 84.0 14260 3 3 8 5 23 23 350.0 ... 192 84 0 0 0 0 3.714286 0 12 15

5 rows × 50 columns

In [49]:
# Scale the data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
In [50]:
scaled_numerical_df_clean = scale.fit_transform(numerical_df_clean)
In [51]:
scaled_numerical_df_clean
Out[51]:
array([[ 0.07337496, -0.22937175, -0.20714171, ..., -0.08768781,
        -1.5991111 , -0.13877749],
       [-0.87256276,  0.4519361 , -0.09188637, ..., -0.08768781,
        -0.48911005,  0.61443862],
       [ 0.07337496, -0.09311018,  0.07347998, ..., -0.08768781,
         0.99089135, -0.13877749],
       ...,
       [ 0.30985939, -0.18395123, -0.14781027, ...,  4.95311151,
        -0.48911005, -1.64520971],
       [-0.87256276, -0.09311018, -0.08016039, ..., -0.08768781,
        -0.8591104 , -1.64520971],
       [-0.87256276,  0.22483348, -0.05811155, ..., -0.08768781,
        -0.1191097 , -0.13877749]])
In [52]:
from sklearn.decomposition import PCA
In [53]:
# Make an instance of the Model
pca = PCA(.95)
pca.fit(scaled_numerical_df_clean)

# Find out how many components PCA has after fitting the model
pca.n_components_
Out[53]:
37
In [54]:
# Transform the data into the principal components
principal_components = pca.transform(scaled_numerical_df_clean)
In [55]:
principal_components.shape
Out[55]:
(1460, 37)
In [56]:
principal_components
Out[56]:
array([[ 2.18984101, -0.16763339, -1.47112319, ...,  0.10860845,
         0.07967898,  0.18651098],
       [ 0.04584576, -1.54323387,  1.0932354 , ..., -0.52074135,
         0.980782  ,  0.05223869],
       [ 2.32696368,  0.04969061, -1.34140998, ...,  0.04947632,
        -0.38301589, -0.09240722],
       ...,
       [ 1.40977495,  2.26467125,  0.85626749, ...,  1.17748467,
        -0.66432792, -0.89253948],
       [-2.71136048, -2.80817005,  1.70334363, ...,  0.44461475,
         0.1831076 , -0.55143658],
       [-0.94569008, -1.84532397,  1.62666513, ...,  1.09212882,
         0.45154769, -0.20866037]])
In [57]:
# Create a DataFrame from the principal components
columns = [f'PC{i+1}' for i in range(principal_components.shape[1])]
principal_df = pd.DataFrame(principal_components, columns=columns)
In [58]:
principal_df.head()
Out[58]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36 PC37
0 2.189841 -0.167633 -1.471123 -2.262940 1.060092 -0.260601 -0.626813 0.068582 -0.784960 -1.201084 ... -0.129178 -0.060662 0.127597 0.328337 -0.118319 0.273451 -0.573404 0.108608 0.079679 0.186511
1 0.045846 -1.543234 1.093235 -0.001000 -0.789579 -1.115314 1.085005 -0.212486 2.294729 0.171581 ... 1.353092 0.475002 -0.393422 -0.711873 -0.376811 0.245289 0.599826 -0.520741 0.980782 0.052239
2 2.326964 0.049691 -1.341410 -1.890272 0.213914 -0.145710 0.097192 0.821088 -0.401888 -1.050750 ... 0.178433 0.148980 0.628293 0.511766 -0.199613 0.286282 -0.168346 0.049476 -0.383016 -0.092407
3 -0.131933 0.477460 0.602497 -0.474069 -0.403288 -0.902429 -0.835986 -1.001447 -0.538358 0.035704 ... -1.205494 -0.579009 -0.622643 -0.595745 -0.688999 -0.031591 -0.217376 -0.113522 -1.401679 -0.329459
4 4.671188 1.059574 0.364506 -1.838918 0.683339 -0.072945 0.449894 0.733929 0.351705 -0.504662 ... -0.255722 0.251739 -0.116956 0.034451 -0.711869 0.473764 0.147203 0.212041 -0.772847 -0.167668

5 rows × 37 columns

Encode Categorical Variables¶

In [59]:
categorical_df_clean = df_clean.select_dtypes(exclude='number')
In [60]:
categorical_df_clean.shape
Out[60]:
(1460, 29)
In [61]:
categorical_df_clean.head()
Out[61]:
MSZoning Street Alley LandContour Utilities LotConfig Neighborhood Condition1 Condition2 BldgType ... Functional GarageType GarageFinish GarageQual GarageCond PavedDrive Fence MiscFeature SaleType SaleCondition
0 RL Pave NA Lvl AllPub Inside CollgCr Norm Norm 1Fam ... Typ Attchd RFn 3.0 3.0 Y None None WD Normal
1 RL Pave NA Lvl AllPub FR2 Veenker Feedr Norm 1Fam ... Typ Attchd RFn 3.0 3.0 Y None None WD Normal
2 RL Pave NA Lvl AllPub Inside CollgCr Norm Norm 1Fam ... Typ Attchd RFn 3.0 3.0 Y None None WD Normal
3 RL Pave NA Lvl AllPub Corner Crawfor Norm Norm 1Fam ... Typ Detchd Unf 3.0 3.0 Y None None WD Abnorml
4 RL Pave NA Lvl AllPub FR2 NoRidge Norm Norm 1Fam ... Typ Attchd RFn 3.0 3.0 Y None None WD Normal

5 rows × 29 columns

In [62]:
# List to conduct one hot-encode
list_to_encode = categorical_df_clean.columns.to_list()
list_to_encode
Out[62]:
['MSZoning',
 'Street',
 'Alley',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'Heating',
 'Electrical',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']
In [63]:
# Turn categorical data into dummy
categorical_df = pd.get_dummies(categorical_df_clean, columns=list_to_encode)
In [64]:
categorical_df.shape
Out[64]:
(1460, 201)

Conduct Lasso to do feature selection¶

In [65]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(categorical_df, df_clean['SalePrice'], test_size=0.2, random_state=42)
In [66]:
x_train.shape
Out[66]:
(1168, 201)
In [67]:
x_test.shape
Out[67]:
(292, 201)
In [68]:
from sklearn.linear_model import Lasso, Ridge

lasso_model = Lasso(alpha=10)
lasso_model.fit(x_train, y_train)

# Get the selected feature names
selected_feature_names = categorical_df.columns[lasso_model.coef_ != 0]

# Print or use the selected feature names as needed
print("Selected Feature Names:", selected_feature_names)
Selected Feature Names: Index([], dtype='object')
In [69]:
selected_feature_names = selected_feature_names.to_list()

Combine numerical data and categorical data¶

In [70]:
concat_df = pd.concat([principal_df, categorical_df[selected_feature_names]], axis=1)
In [71]:
concat_df.head()
Out[71]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36 PC37
0 2.189841 -0.167633 -1.471123 -2.262940 1.060092 -0.260601 -0.626813 0.068582 -0.784960 -1.201084 ... -0.129178 -0.060662 0.127597 0.328337 -0.118319 0.273451 -0.573404 0.108608 0.079679 0.186511
1 0.045846 -1.543234 1.093235 -0.001000 -0.789579 -1.115314 1.085005 -0.212486 2.294729 0.171581 ... 1.353092 0.475002 -0.393422 -0.711873 -0.376811 0.245289 0.599826 -0.520741 0.980782 0.052239
2 2.326964 0.049691 -1.341410 -1.890272 0.213914 -0.145710 0.097192 0.821088 -0.401888 -1.050750 ... 0.178433 0.148980 0.628293 0.511766 -0.199613 0.286282 -0.168346 0.049476 -0.383016 -0.092407
3 -0.131933 0.477460 0.602497 -0.474069 -0.403288 -0.902429 -0.835986 -1.001447 -0.538358 0.035704 ... -1.205494 -0.579009 -0.622643 -0.595745 -0.688999 -0.031591 -0.217376 -0.113522 -1.401679 -0.329459
4 4.671188 1.059574 0.364506 -1.838918 0.683339 -0.072945 0.449894 0.733929 0.351705 -0.504662 ... -0.255722 0.251739 -0.116956 0.034451 -0.711869 0.473764 0.147203 0.212041 -0.772847 -0.167668

5 rows × 37 columns

In [72]:
concat_df.shape
Out[72]:
(1460, 37)
In [73]:
df_clean
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(concat_df, df_clean['SalePrice'], test_size=0.2, random_state=42)

# Train the model - Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42) 
rf.fit(x_train, y_train)

# Test the model
y_pred = rf.predict(x_test)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)
Mean Absolute Error: 0.10416006535399967
Mean Squared Error: 0.024199680182249703
Root Mean Squared Error (RMSE): 0.15556246392446252
R-squared: 0.8703219337513204
In [74]:
import xgboost as xgb

from numpy import loadtxt
from xgboost import XGBRegressor

# fit model no training data
model = XGBRegressor()
model.fit(x_train, y_train)

# Test the model
y_pred = model.predict(x_test)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)
Mean Absolute Error: 0.10884051148169183
Mean Squared Error: 0.02466610410093325
Root Mean Squared Error (RMSE): 0.15705446221274086
R-squared: 0.8678225225454078
In [75]:
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBRegressor

# # Define your XGBoost model
# xgb_model = XGBRegressor()

# # Define the hyperparameter grid
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'n_estimators': [50, 100, 200, 300],
#     'max_depth': [3, 5, 7, 9],
#     'min_child_weight': [1, 3, 5, 7],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
#     'gamma': [0, 0.1, 0.2, 0.3],
#     'scale_pos_weight': [1, 2, 3]
# }

# # Create GridSearchCV object
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# # Fit the model to the data
# grid_search.fit(x_train, y_train)

# # Print the best parameters and corresponding RMSE
# print("Best Parameters: ", grid_search.best_params_)
# print("Best RMSE: ", (-grid_search.best_score_) ** 0.5)

# # Get the best model
# best_xgb_model = grid_search.best_estimator_
In [76]:
# fit model no training data
model = XGBRegressor(
    colsample_bytree=0.8,
    gamma=0,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=7,
    n_estimators=300,
    scale_pos_weight=1,
    subsample=0.9
)
model.fit(x_train, y_train)


# Test the model
y_pred = model.predict(x_test)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)
Mean Absolute Error: 0.10013308511851164
Mean Squared Error: 0.021651133900594074
Root Mean Squared Error (RMSE): 0.14714324279624286
R-squared: 0.8839787486786836

Prepare Test Data¶

In [77]:
test.isna().sum()
Out[77]:
Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64
In [78]:
test.shape
Out[78]:
(1459, 80)
In [79]:
# ExterQual
mapping = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
test['ExterQual'] = test['ExterQual'].map(mapping)
test['ExterCond'] = test['ExterCond'].map(mapping)
test['HeatingQC'] = test['HeatingQC'].map(mapping)
test['KitchenQual'] = test['KitchenQual'].map(mapping)
test['BsmtQual'] = test['BsmtQual'].map(mapping)
test['BsmtCond'] = test['BsmtCond'].map(mapping)
test['FireplaceQu'] = test['FireplaceQu'].map(mapping)
test['PoolQC'] = test['PoolQC'].map(mapping)


mapping_2 = {'Gd': 3,'Av': 2, 'Mn': 1}
test['BsmtExposure'] = test['BsmtExposure'].map(mapping_2)

mapping_3 = {'GLQ': 6,'ALQ': 5, 'BLQ': 4,'Rec': 3,'LwQ': 2, 'Unf': 1}
test['BsmtFinType1'] = test['BsmtFinType1'].map(mapping_3)
test['BsmtFinType2'] = test['BsmtFinType2'].map(mapping_3)

mapping_4 = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
test['GarageQual'] = test['GarageQual'].map(mapping_4)
test['GarageCond'] = test['GarageCond'].map(mapping_4)

mapping_5 = {'N': 0, 'Y': 1}
test['CentralAir'] = test['CentralAir'].map(mapping_5)

mapping_6 = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
test['LotShape'] = test['LotShape'].map(mapping_6)

mapping_7 = {'Gtl': 3, 'Mod': 2, 'Sev': 1}
test['LandSlope'] = test['LandSlope'].map(mapping_7)
In [80]:
test.shape
Out[80]:
(1459, 80)
In [81]:
test['LotFrontage'].fillna(test['LotFrontage'].mean(), inplace =True)
test['Alley'].fillna('NA', inplace =True)
test['MasVnrType'].fillna('None', inplace =True)
test['MasVnrArea'].fillna(0, inplace =True)
test['BsmtQual'].fillna(test['BsmtQual'].mean(), inplace =True)
test['BsmtCond'].fillna(test['BsmtCond'].mean(), inplace =True)
test['BsmtExposure'].fillna(test['BsmtExposure'].mean(), inplace =True)
test['BsmtFinType1'].fillna(test['BsmtFinType1'].mean(), inplace =True)
test['BsmtFinType2'].fillna(test['BsmtFinType2'].mean(), inplace =True)
test['Electrical'].fillna('None', inplace =True)
test['FireplaceQu'].fillna(test['FireplaceQu'].mean(), inplace =True)
test['GarageType'].fillna('None', inplace =True)
test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean(), inplace =True)
test['GarageFinish'].fillna('None', inplace =True)
test['GarageQual'].fillna('None', inplace =True)
test['GarageCond'].fillna('None', inplace =True)
test['PoolQC'].fillna(test['PoolQC'].mean(), inplace =True)
test['Fence'].fillna('None', inplace =True)
test['MiscFeature'].fillna('None', inplace =True)
test['MSZoning'].fillna('None', inplace =True)
test['SaleType'].fillna('Oth', inplace =True)
test['Utilities'].fillna('None', inplace =True)
test['Exterior1st'].fillna('Other', inplace =True)
test['Exterior2nd'].fillna('Other', inplace =True)
test['BsmtFinSF1'].fillna(0, inplace =True)
test['BsmtFinSF2'].fillna(0, inplace =True)
test['BsmtUnfSF'].fillna(0, inplace =True)
test['TotalBsmtSF'].fillna(0, inplace =True)
test['BsmtFullBath'].fillna(0, inplace =True)
test['BsmtHalfBath'].fillna(0, inplace =True)
test['KitchenQual'].fillna(test['KitchenQual'].mean(), inplace =True)
test['Functional'].fillna('None', inplace =True)
test['GarageCars'].fillna(0, inplace =True)
test['GarageArea'].fillna(0, inplace =True)
In [82]:
test_na = test.isna().sum()
test_na[test_na > 0]
Out[82]:
Series([], dtype: int64)
In [83]:
test_clean = test
In [84]:
test_clean.shape
Out[84]:
(1459, 80)
In [85]:
test_clean['GarageYrBlt'].dtype
test_clean['GarageYrBlt'] = 2023 - test_clean['GarageYrBlt']
test_clean['YearBuilt'] = 2023 - test_clean['YearBuilt']
test_clean['YearRemodAdd'] = 2023 - test_clean['YearRemodAdd']
test_clean['YrSold'] = 2023 - test_clean['YrSold']
In [86]:
test_clean['YearBuilt'].head()
Out[86]:
0    62
1    65
2    26
3    25
4    31
Name: YearBuilt, dtype: int64
In [87]:
test_clean.rename(columns={'GarageYrBlt': 'GarageAge'}, inplace=True)
test_clean.rename(columns={'YearBuilt': 'YearBuiltAge'}, inplace=True)
test_clean.rename(columns={'YearRemodAdd': 'YearRemodAddAge'}, inplace=True)
test_clean.rename(columns={'YrSold': 'YrSoldAge'}, inplace=True)
In [88]:
# test_clean['Log_GarageAge'] = np.log(test_clean['GarageAge'])
# test_clean['Log_YearBuiltAge'] = np.log(test_clean['YearBuiltAge'])
# test_clean['Log_YearRemodAddAge'] = np.log(test_clean['YearRemodAddAge'])
# test_clean['Log_YrSoldAge'] = np.log(test_clean['YrSoldAge'])

# test_clean.drop(columns=['GarageAge', 'YearBuiltAge', 'YearRemodAddAge', 'YrSoldAge'])
In [89]:
test_clean_na = test_clean.isna().sum()
test_clean_na[test_clean_na > 0]
Out[89]:
Series([], dtype: int64)
In [90]:
#test_clean['Log_GarageAge'].fillna(test['Log_GarageAge'].mean(), inplace =True)
In [91]:
test_clean.shape
Out[91]:
(1459, 80)
In [92]:
numerical_test_clean = test_clean.select_dtypes(include='number')
In [93]:
numerical_test_clean = numerical_test_clean.drop(columns=['Id'], axis=1)
In [94]:
numerical_test_clean.shape
Out[94]:
(1459, 50)
In [95]:
numerical_test_clean.head()
Out[95]:
MSSubClass LotFrontage LotArea LotShape LandSlope OverallQual OverallCond YearBuiltAge YearRemodAddAge MasVnrArea ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC MiscVal MoSold YrSoldAge
0 20 80.0 11622 4 3 5 6 62 62 0.0 ... 140 0 0 0 120 0 4.666667 0 6 13
1 20 81.0 14267 3 3 6 6 65 65 108.0 ... 393 36 0 0 0 0 4.666667 12500 6 13
2 60 74.0 13830 3 3 5 5 26 25 0.0 ... 212 34 0 0 0 0 4.666667 0 3 13
3 60 78.0 9978 3 3 6 6 25 25 20.0 ... 360 36 0 0 0 0 4.666667 0 6 13
4 120 43.0 5005 3 3 8 5 31 31 0.0 ... 0 82 0 0 144 0 4.666667 0 1 13

5 rows × 50 columns

In [96]:
scaled_numerical_test_clean = scale.fit_transform(numerical_test_clean)
In [97]:
test_principal_components = pca.transform(scaled_numerical_test_clean)
In [98]:
test_principal_components.shape
Out[98]:
(1459, 37)
In [99]:
columns = [f'PC{i+1}' for i in range(test_principal_components.shape[1])]
test_principal_df = pd.DataFrame(test_principal_components, columns=columns)
In [100]:
test_principal_df.head()
Out[100]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36 PC37
0 -2.839040 -1.470161 0.664487 0.779536 -0.663260 0.444999 0.169294 1.049017 -0.625156 -0.273826 ... -0.310042 0.407158 -0.646351 -0.558659 0.486072 0.333655 -0.313075 0.456774 0.923647 -0.150863
1 -1.133623 -0.612153 2.279086 -0.536540 -0.587211 -1.377610 0.556120 2.859048 3.248475 -2.617143 ... -0.902030 -0.339549 -0.888399 0.741832 -0.399486 0.146729 -0.379392 1.920819 -0.020897 -0.402737
2 0.759764 -0.246478 0.006912 -1.955643 0.838348 0.437214 1.329624 0.224668 0.030200 -2.362899 ... 0.261019 -0.570169 -0.114005 -0.428628 0.376088 0.006565 0.077427 -0.774392 0.974675 -1.026337
3 1.445369 0.251309 -0.348032 -1.589184 -0.202136 -0.480743 0.754694 -0.509421 -0.076047 -1.568251 ... 0.013802 -0.099050 -0.044536 -0.814137 -0.280436 -0.618989 0.063194 -0.240648 0.348658 -1.909528
4 1.272475 -0.990319 -2.566121 0.613926 -0.023332 0.447076 -0.555119 0.019079 -1.038859 0.746275 ... -0.969416 -0.122335 -0.828462 0.188991 -0.495817 -0.240772 0.481708 1.247824 0.415142 -0.639205

5 rows × 37 columns

In [101]:
categorical_test_clean = test_clean.select_dtypes(exclude='number')
In [102]:
categorical_test_clean.shape
Out[102]:
(1459, 29)
In [103]:
categorical_test = pd.get_dummies(categorical_test_clean, columns=list_to_encode)
In [104]:
# Assuming 'categorical_test' and 'categorical_df' are your DataFrames
missing_columns = set(categorical_df.columns) - set(categorical_test.columns)

# Add missing columns to 'categorical_test' and fill with zeros
for column in missing_columns:
    categorical_test[column] = 0
In [105]:
# Assuming 'categorical_test' and 'categorical_df' are your DataFrames
extra_columns = set(categorical_test.columns) - set(categorical_df.columns)

# Drop extra columns from 'categorical_test'
categorical_test = categorical_test.drop(columns=extra_columns, errors='ignore')
In [106]:
categorical_test.shape
Out[106]:
(1459, 201)
In [107]:
concat_test = pd.concat([test_principal_df, categorical_test[selected_feature_names]], axis=1)
In [108]:
concat_test.shape
Out[108]:
(1459, 37)
In [109]:
y_pred = rf.predict(concat_test)
In [110]:
y_pred = np.exp(y_pred)
In [111]:
y_pred = pd.DataFrame(y_pred)
In [112]:
test['SalePrice'] = y_pred
In [113]:
submission_4 = test[['Id','SalePrice']]
In [114]:
submission_4.head()
Out[114]:
Id SalePrice
0 1461 123104.018362
1 1462 164577.118041
2 1463 182534.158556
3 1464 201009.454464
4 1465 182511.697001
In [115]:
submission_4.to_csv('submission_4.csv', index=False)
In [ ]: